#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import os
import sys
import json
import MySQLdb
import datetime
import time
import random

# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

# Directory for downloaded images; not referenced by the code visible in this file.
images_path = "/data/images/"
# Listing endpoint; getUserList appends "?page=<n>".
taobao_url = "https://mm.taobao.com/json/request_top_list.htm"
# Per-user profile endpoint; getUserList appends "?user_id=<id>".
personal_url = "https://mm.taobao.com/self/info/model_info_show.htm"

# Current year as a 4-digit string, captured once at import time; getUserList
# subtracts a user's age from it to estimate the birth year.
year = datetime.datetime.now().strftime("%Y")
# Database connection opened at import time and shared by the whole script.
# NOTE(review): credentials are hard-coded here - consider loading them from
# configuration or the environment.
conn = MySQLdb.connect(host="localhost", user="root", passwd="password", port=3306, db='taobao', connect_timeout = 5, charset='utf8')
# Single shared cursor used for every statement.
curs = conn.cursor()

# First run of digits not starting with 0; used to pull the user id out of the
# profile URL.
_USER_ID_RE = re.compile(r'[1-9]\d*')
# Every run of digits; used to pull the numeric parts out of the birthday text.
_DIGITS_RE = re.compile(r'\d+')

# MySQLdb uses the DB-API "format" paramstyle: every placeholder is %s no
# matter the column type, and the driver quotes/escapes the bound values.
_INSERT_MM_SQL = (
    "insert into mm (user_id, user_name, age, birthday, city, fans, url, jobs, "
    "blood_type, school, style, height, weight, size, bar, shose, experience) "
    "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)


def _firstText(node, selector):
    """Return the text of the first element under *node* matching *selector*.

    Raises IndexError when nothing matches.
    """
    return node.select(selector)[0].get_text()


def _storeListItem(item):
    """Parse one ``.list-item`` entry, fetch its profile page and insert a row.

    Any parsing, conversion or database error propagates to the caller.
    """
    # Personal page URL (the scheme is prepended, so the href is presumably
    # protocol-relative).
    profile_url = "https:" + item.select('.lady-avatar')[0]['href']
    # Name
    username = _firstText(item, ".lady-name")
    # The first <em><strong> holds the age, the second the number of fans.
    stats = item.select("em strong")
    age = stats[0].get_text()
    fans = stats[1].get_text()
    # User id
    user_id = _USER_ID_RE.findall(profile_url)[0]

    personal_content = getContent(personal_url + "?user_id=" + str(user_id))
    if personal_content is None:
        # getContent has already printed the underlying error.
        print("skipping user %s: profile page could not be fetched" % user_id)
        return

    profile = BeautifulSoup(personal_content, 'lxml')
    # Select the info rows once instead of once per field.
    rows = profile.select(".mm-p-base-info ul li")

    # Birth year is estimated as (current year - age); the remaining parts are
    # the numbers found in the birthday text, joined with '-'.
    birthday_text = _firstText(rows[1], "span")
    birthday = str(int(year) - int(age)) + "-" + '-'.join(_DIGITS_RE.findall(birthday_text))

    # Rows 2-6 keep their value in a <span>, rows 7-11 in a <p>.
    city, jobs, blood_type, school, style = [_firstText(rows[i], "span") for i in range(2, 7)]
    height, weight, size, bar, shose = [_firstText(rows[i], "p") for i in range(7, 12)]
    experience = _firstText(profile, ".mm-p-experience-info p")

    param = (int(user_id), username, int(age), birthday, city, int(fans), profile_url,
             jobs, blood_type, school, style, height, weight, size, bar, shose, experience)

    # The values are scraped, untrusted text: hand them to the driver separately
    # from the SQL text so quotes in them cannot break or alter the statement.
    curs.execute(_INSERT_MM_SQL, param)
    conn.commit()


def getUserList(p):
    """Scrape listing page *p* and store every user found on it.

    Fetches ``taobao_url?page=p``, then for each ``.list-item`` entry fetches
    the user's profile page and inserts one row into the ``mm`` table.

    Errors are printed, never raised. A failure while handling one user skips
    only that user; the rest of the page is still processed.

    Args:
        p: Page number appended to the listing URL.

    Returns:
        False when the page contains no ``.list-item`` entries, otherwise None
        (including when the listing page could not be fetched or parsed).
    """
    url = taobao_url + "?page=" + str(p)
    try:
        content = getContent(url)
        if content is None:
            # Fetch failed; getContent has already printed the reason.
            return None
        items = BeautifulSoup(content, 'lxml').select('.list-item')
    except Exception as e:
        print(e)
        return None

    # No entries on this page: report it so the caller can move on.
    if not items:
        return False

    for item in items:
        try:
            _storeListItem(item)
        except Exception as e:
            # One malformed profile must not discard the rest of the page.
            print(e)


# Fetch the content of a web page
def getContent( url ):
    """Download *url* and return its body decoded from GBK.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or None when the request or the decoding
        fails (the error is printed).
    """
    try:
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            return response.read().decode('gbk')
        finally:
            # Always release the underlying socket, even if read/decode fails.
            response.close()
    except Exception as e:
        print(e)
        return None

# Save an image from the network
def saveImage( album_path, pic_name, pic_url ):
    """Download *pic_url* and write it to ``<album_path>/<pic_name>.jpg``.

    Args:
        album_path: Existing directory that receives the file.
        pic_name: File name without extension.
        pic_url: Address of the image to download.

    Download and file-system errors propagate to the caller.
    """
    response = urllib.urlopen(pic_url)
    try:
        pic = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()

    # The context manager closes the file even if the write raises.
    with open( album_path + "/" + pic_name + ".jpg", 'wb') as out:
        out.write(pic)

# Save text to a file
def saveFile(file_path, name, content):
    """Append *content*, encoded as UTF-8, to ``<file_path>/<name>.txt``.

    Args:
        file_path: Existing directory that holds the file.
        name: File name without the ``.txt`` extension.
        content: Text to append.

    File-system and encoding errors propagate to the caller.
    """
    file_name = file_path + "/" + name + '.txt'
    # The payload is already encoded bytes, so append in binary mode; the
    # context manager guarantees the data is flushed and the handle closed.
    with open( file_name, "ab" ) as out:
        out.write( content.encode('utf-8') )

# Create a directory
def mkdir(path):
    """Create *path* (with any missing parents) unless it already exists.

    Leading and trailing whitespace is stripped from *path* first.
    """
    target = path.strip()
    if os.path.exists( target ):
        return
    os.makedirs( target )

def mysql_execute( sql, params=None ):
    """Execute one statement on the shared cursor and commit it.

    Args:
        sql: Statement text. When *params* is given it must use ``%s``
            placeholders (the MySQLdb paramstyle).
        params: Optional sequence of values bound by the driver. Prefer this
            over formatting values into *sql*, so they are escaped properly.

    Any database error propagates to the caller after a rollback attempt.
    """
    committed = False
    try:
        curs.execute(sql, params)
        conn.commit()
        committed = True
    finally:
        if not committed:
            # Do not leave a half-finished transaction on the shared connection.
            try:
                conn.rollback()
            except Exception:
                # Connection is unusable; let the original error propagate.
                pass

if __name__ == "__main__":
    try:
        # Crawl listing pages 1..4315.
        for p in range(1, 4316):
            getUserList(p)
            # Pause 0-2 seconds (random) between pages.
            time.sleep(random.randint(0,2))
    finally:
        # Release the database resources even if the crawl is interrupted
        # (e.g. Ctrl-C) or fails unexpectedly.
        curs.close()
        conn.close()